Motors for Bayesian Models

Features of the grid approximation

  • compute posterior at each grid point
  • grid spacing determines the precision of the approximation
  • simple to compute
  • limited to 5-D by computational costs
  • robust method in the presence of multiple modes

Recap of the Grid Approximation

A. Five-step protocol in one dimension.

  1. Define a grid
p_grid <-seq(from=0,to=1,length.out=100)
  1. Define the prior
prior <-rep(1,100)/sum(rep(1,100))
sum(prior)
## [1] 1
  1. Compute the likelihood at each value in grid
likelihood <- dbinom(6,size=9,prob=p_grid)
sum(likelihood)
## [1] 9.900001
  1. Compute product of likelihood and prior
unstd.posterior <-likelihood*prior
  1. Standardize the posterior, so it sums to 1.
posterior <-unstd.posterior/sum(unstd.posterior)
sum(posterior)
## [1] 1
plot( p_grid, posterior, 
      type="b",
      xlab="Probability of water",
      ylab="Posterior probability")
mtext( "100 points")

p_grid <-seq(from=0,to=1,length.out=100)
prior <-rep(1,100)/sum(rep(1,100))
sum(prior)
## [1] 1
likelihood <-dbinom(6,size=9,prob=p_grid)
sum(likelihood)
## [1] 9.900001
unstd.posterior <-likelihood*prior
posterior <-unstd.posterior/sum(unstd.posterior)
sum(posterior)
## [1] 1
plot( p_grid, posterior, 
      type="b",
      xlab="Probability of water",
      ylab="Posterior probability")
mtext( "100 points")

Repeat with Step prior

sprior <- ifelse(p_grid < 0.5, 0, 1)/sum(ifelse(p_grid < 0.5, 0, 1))
sum(sprior)
## [1] 1
slikelihood <-dbinom(6,size=9,prob=p_grid)
unstd.sposterior <-slikelihood*sprior
sposterior <-unstd.sposterior/sum(unstd.sposterior)
sum(sposterior)
## [1] 1
plot( p_grid, sposterior, 
      type="b",
      xlab="Probability of water",
      ylab="Posterior probability")
points(p_grid, sprior, col='magenta')
points(p_grid, posterior, col='cyan')
mtext( "Step prior, 100 grid points")

legend(x = "topright",
       c("sposterior", "sprior", "posterior"),
       cex=.8,
       col=c("black","magenta","cyan"),
       lwd = 2,
       bty = "n"
       )  

Example of a multimodal distribution

bimodal

Quadratic approximation: Pierre Simon Laplace

  • France’s Issac Newton.
  • Proved the general form of the Central Limit Theorem.
  • Proved a Bayesian interpretation of linear least-squares.
  • Developed the Laplace transform.
  • Developed spherical harmonics.

Pierre Simon LaPlace

A Philosophical Eassy on Probability

Eassy on Probability

Source: Internet Archive

There is a newer translation by Andrew I. Dale in 1998 and 2012 published by Springer. It is suppose to be easier to read than the 1902 version and there is commentary on the mathematical notation.

Andrew Dale also wrote

Dale, A.I. (2012) A history of inverse probability: From Thomas Bayes to Karl Pearson Springer Science & Business Media.

and

Dale, A.I. (2003) Most Honourable Remembrance: The Life and Work of Thomas Bayes. Springer.

McGrayne’s book

McGrayne’s book

Compute the quadratic approxmatiation using the quap function from the rethinking library

library(rethinking)
## Loading required package: rstan
## Loading required package: StanHeaders
## Loading required package: ggplot2
## rstan (Version 2.21.5, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## Loading required package: cmdstanr
## This is cmdstanr version 0.5.2
## - CmdStanR documentation and vignettes: mc-stan.org/cmdstanr
## - CmdStan path: /Users/blaine/.cmdstan/cmdstan-2.30.1
## - CmdStan version: 2.30.1
## Loading required package: parallel
## rethinking (Version 2.21)
## 
## Attaching package: 'rethinking'
## The following object is masked from 'package:rstan':
## 
##     stan
## The following object is masked from 'package:stats':
## 
##     rstudent
globe.qa <- quap(
  alist(
     W ~ dbinom(W+L, p), # binomial likelihood
     p ~ dunif(0,1)   # uniform prior
  ),
  data=list(W=6, L=3) )

# display summary of quadratic approximation
precis(globe.qa)
# analytical calculation
W <-6
L <-3
curve( dbeta(x,W+1,L+1),from=0,to=1)
# quadraticapproximation
curve( dnorm(x,0.67,0.16),lty=2,add=TRUE)

Now double the amount of data

globe.qa <- quap(
  alist(
     W ~ dbinom(W+L, p), # binomial likelihood
     p ~ dunif(0,1)   # uniform prior
  ),
  data=list(W=12, L=6) )

# display summary of quadratic approximation
precis(globe.qa)
# analytical calculation
W <-12
L <-6
curve( dbeta(x,W+1,L+1),from=0,to=1)
# quadraticapproximation
curve( dnorm(x,0.67,0.11),lty=2,add=TRUE)

Now double the data again:

globe.qa <- quap(
  alist(
     W ~ dbinom(W+L, p), # binomial likelihood
     p ~ dunif(0,1)   # uniform prior
  ),
  data=list(W=24, L=12) )

# display summary of quadratic approximation
precis(globe.qa)

Plug the new sd into the quadratic approximation formula. We finally get more reasonable approximation.

# analytical calculation
W <-24
L <-12
curve( dbeta(x,W+1,L+1),from=0,to=1)
# quadraticapproximation
curve( dnorm(x,0.67,0.08),lty=2,add=TRUE)

Now double the data again:

globe.qa <- quap(
  alist(
     W ~ dbinom(W+L, p), # binomial likelihood
     p ~ dunif(0,1)   # uniform prior
  ),
  data=list(W=48, L=24) )

# display summary of quadratic approximation
precis(globe.qa)
# analytical calculation
W <-48
L <-24
curve( dbeta(x,W+1,L+1),from=0,to=1)
# quadraticapproximation
curve( dnorm(x,0.67,0.06),lty=2,add=TRUE)

Caveats about the Quadration Approximation

  • The fit generally improves with more data.
  • Sometimes, it can remain a poor approximation even with 1000s of data points.
  • With a lot of data or an uniform prior, this approach is equavilent to a Maximum Likelihood Estimate (MLE).
  • When equivalent to MLE, can reinterpret the model fits in terms of a Bayesian analysis.
  • Shares the drawbacks of MLEs.
  • Computing a Hessian is required to get the standard deviation. This computation can go wrong.

Markov Chain Monte Carlo (MCMC)

Multi-level or hierarchical models are common and have large numbers of parameters. Grid approximation and Quadratic approximation are often not adquate for large models.

MCMC merely draws samples from the posterior rather than computing the posterior directly. The frequencies of the parameter values of these samples is proportional to their posterior plausability. The histogram of these samples gives a picture of the posterior. So with MCMC, we work with samples rather then with an estimate of the posterior distribution.

MCMC orginiated with the need to estimate high dimensional integrals

2-D example

Source: Sobol, I. M. (1973). Numerical Monte Carlo Methods. Nauka, Moscow.

The ratio of the number of points inside the boundary and outside give the proportion of the square occupied by the irregular area.

Determine area of irregular shape #### The bounding area could be another curve

Source: Robert, C. P., Casella, G., and Casella, G. (2010). Introducing Monte Carlo methods with R. New York: Springer.

Determine area under aribtrary curve

Originated at Los Alamos in the late 1940s during work on the H-bomb

\[\begin{equation} P(\Delta E)=e^{-\Delta E / k T} \end{equation}\]

  1. Make a move (e.g., select a sample, change a torsion angle, translate the ligand, rotate the ligand)

  2. Calculate the energy (\(E\))

  3. Compare E to prior \(E^{\circ}\)

    • If \(\Delta E < 0\), accept the move
    • If \(\Delta E > 0\), accept move if a randomly selected number between 0 and 1 is less then \(P\). Otherwise, reject the move.

Source: Metropolis, Nicholas; Rosenbluth, Arianna W.; Rosenbluth, Marshall N.; Teller, Augusta H.; and Teller, Edward (1953) Equation of state Calculations by Fast Computing Machines. The Journal of Chemical Physics. 21(6):1087.

MCMC applied to generating samples from the posterior for the globe tossing problem

Example using the Metropolis algorithm with 1000 samples.

n_samples <-1000
p <-rep(NA,n_samples)
p[1] <- 0.5
W <-6
L <-3
for (i in 2:n_samples){
    p_new <-rnorm(1,p[i-1],0.1)
    if (p_new<0) p_new <- abs(p_new)
    if (p_new>1) p_new <- 2 - p_new
    q0 <- dbinom(W,W+L,p[i-1])
    q1 <- dbinom(W,W+L,p_new)
    p[i] <- ifelse(runif(1)<q1/q0,p_new,p[i-1])
}
dens( p, xlim=c(0,1))
curve( dbeta(x,W+1,L+1),lty=2,add=TRUE)

Example using the Metropolis algorithm with 10,000 samples.

n_samples <-10000
p <-rep(NA,n_samples)
p[1] <- 0.5
W <-6
L <-3
for (i in 2:n_samples){
    p_new <-rnorm(1,p[i-1],0.1)
    if (p_new<0) p_new <- abs(p_new)
    if (p_new>1) p_new <- 2 - p_new
    q0 <- dbinom(W,W+L,p[i-1])
    q1 <- dbinom(W,W+L,p_new)
    p[i] <- ifelse(runif(1)<q1/q0,p_new,p[i-1])
}
dens( p, xlim=c(0,1))
curve( dbeta(x,W+1,L+1),lty=2,add=TRUE)

Example with 100,000 samples

n_samples <-100000
p <-rep(NA,n_samples)
p[1] <- 0.5
W <-6
L <-3
for (i in 2:n_samples){
    p_new <-rnorm(1,p[i-1],0.1)
    if (p_new<0) p_new <- abs(p_new)
    if (p_new>1) p_new <- 2 - p_new
    q0 <- dbinom(W,W+L,p[i-1])
    q1 <- dbinom(W,W+L,p_new)
    p[i] <- ifelse(runif(1)<q1/q0,p_new,p[i-1])
}
dens( p, xlim=c(0,1))
curve( dbeta(x,W+1,L+1),lty=2,add=TRUE)

Example with 1,000,000 samples

n_samples <-1000000
p <-rep(NA,n_samples)
p[1] <- 0.5
W <-6
L <-3
for (i in 2:n_samples){
    p_new <-rnorm(1,p[i-1],0.1)
    if (p_new<0) p_new <- abs(p_new)
    if (p_new>1) p_new <- 2 - p_new
    q0 <- dbinom(W,W+L,p[i-1])
    q1 <- dbinom(W,W+L,p_new)
    p[i] <- ifelse(runif(1)<q1/q0,p_new,p[i-1])
}
dens( p, xlim=c(0,1))
curve( dbeta(x,W+1,L+1),lty=2,add=TRUE)